/* ubi1024_asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * \author  Daniel Otte
 * \email   daniel.otte@rub.de
 * \date    2009-03-16
 * \license GPLv3 or later
 */ 

#include "avr-asm-macros.S"

/******************************************************************************/
/*
void ubi1024_init(ubi1024_ctx_t* ctx, const void* g, uint8_t type){
	memset(ctx->tweak, 0, 15);
	ctx->tweak[15] = 0x40+type;
	memcpy(ctx->g, g, UBI1024_BLOCKSIZE_B);
}
*/
/*
 * param ctx:  r24:r25
 * param g:    r22:r23
 * param type: r20
 */
.global ubi1024_init
/*
 * ubi1024_init
 *   in:  r24:r25 = ctx, r22:r23 = g, r20 = type
 *   Writes 15 zero bytes at ctx, then (type | 0x40) as the 16th byte,
 *   then copies 128 bytes from g to the bytes that follow.
 *   Scratch: r20, r21, X (r26:r27), Z (r30:r31).
 *   r1 is stored as the zero value; it is assumed to hold 0 on entry
 *   (avr-gcc zero-register convention -- confirm for non-C callers).
 */
ubi1024_init:
	movw r30, r22                      /* Z = g   (copy source)      */
	movw r26, r24                      /* X = ctx (write cursor)     */

	/* ctx[0..14] = 0 */
	ldi r21, 15
.Lubi1024_init_zero_tweak:
	st X+, r1
	dec r21
	brne .Lubi1024_init_zero_tweak

	/* ctx[15] = type | 0x40 */
	ori r20, 0x40
	st X+, r20

	/* ctx[16..143] = g[0..127]; r20 is reused as the transfer byte */
	ldi r21, 128
.Lubi1024_init_copy_g:
	ld r20, Z+
	st X+, r20
	dec r21
	brne .Lubi1024_init_copy_g
	ret

/******************************************************************************/
/*
void ubi1024_ctx2hash(void* dest, const ubi1024_ctx_t* ctx){
	memcpy(dest, ctx->g, UBI1024_BLOCKSIZE_B);
}
*/
/*
 * param dest: r24:r25
 * param ctx:  r22:r23
 */
.global ubi1024_ctx2hash
/*
 * ubi1024_ctx2hash
 *   in:  r24:r25 = dest, r22:r23 = ctx
 *   Copies the 128 bytes starting at ctx+16 to dest.
 *   Scratch: r22, r23, X (r26:r27), Z (r30:r31).
 */
ubi1024_ctx2hash:
	movw r30, r22                      /* Z = ctx                        */
	adiw r30, 16                       /* Z = ctx + 16 (skip 16 bytes)   */
	movw r26, r24                      /* X = dest                       */
	ldi r22, 128                       /* r22 is free now: byte counter  */
.Lubi1024_ctx2hash_copy:
	ld r23, Z+
	st X+, r23
	dec r22
	brne .Lubi1024_ctx2hash_copy
	ret

/******************************************************************************/
/*
void ubi1024_nextBlock(ubi1024_ctx_t* ctx, const void* block){
	threefish1024_ctx_t tfctx;
	((uint64_t*)(ctx->tweak))[0] += UBI1024_BLOCKSIZE_B;
	threefish1024_init(ctx->g, ctx->tweak, &tfctx);
	memcpy(ctx->g, block, UBI1024_BLOCKSIZE_B);
	threefish1024_enc(ctx->g, &tfctx);
	memxor(ctx->g, block, UBI1024_BLOCKSIZE_B);
	ctx->tweak[15] &= (uint8_t)~0x40;
}
*/
/*
 * param ctx:   r24:r25
 * param block: r22:r23
 */
/*
 * Register aliases for ubi1024_nextBlock. r2..r7 are saved with
 * push_range and restored with pop_range around the body.
 *   CTX1:CTX0     ctx on entry; re-pointed to ctx+16 (ctx->g) before the
 *                 threefish1024_init call and kept there afterwards
 *   BLOCK1:BLOCK0 block (input data)
 *   TFCTX1:TFCTX0 address of the 20*8-byte threefish context on the stack
 */
CTX0   = 2
CTX1   = 3
BLOCK0 = 4
BLOCK1 = 5
TFCTX0 = 6
TFCTX1 = 7
/*
 * ubi1024_nextBlock
 *   in:  r24:r25 = ctx, r22:r23 = block (128 bytes are read)
 *   Steps (see the C reference above):
 *     1. add 128 to the little-endian counter at the start of ctx->tweak;
 *        the carry is propagated through bytes 0..11
 *     2. threefish1024_init(ctx->g, ctx->tweak, &tfctx)
 *     3. ctx->g = block
 *     4. threefish1024_enc(ctx->g, &tfctx)
 *     5. ctx->g ^= block
 *     6. clear bit 0x40 in ctx->tweak[15]
 *   r1 is used as a zero operand (assumed 0, avr-gcc convention).
 *   CTX/BLOCK/TFCTX must survive both calls; this relies on the callees
 *   preserving r2..r7 (true for avr-gcc ABI conforming code -- the
 *   callees are not visible here).
 */
.global ubi1024_nextBlock
ubi1024_nextBlock:
	stack_alloc_large 20*8
	push_range 2, 7
	adiw r30, 1 /* Z points to tfctx */
	movw TFCTX0, r30
	movw CTX0, r24
	movw BLOCK0, r22
	movw r26, r24
/* add BLOCKSIZE_B (128) to tweak */
	ldi r25, 128
	ld r24, X
	add r24, r25
	st X+, r24
	ldi r25, 11                        /* remaining counter bytes 1..11 */
.Lubi1024_nextBlock_carry:
	ld r24, X
	adc r24, r1                        /* += carry; dec below leaves C untouched */
	st X+, r24
	dec r25
	brne .Lubi1024_nextBlock_carry
/* call threefish1024_init */
	movw r24, CTX0
	adiw r24, 16                       /* arg0 = ctx->g     */
	movw r22, CTX0                     /* arg1 = ctx->tweak */
	movw CTX0, r24  /* CTX points to ctx->g */
	movw r20, TFCTX0                   /* arg2 = &tfctx     */
	rcall threefish1024_init
/* copy block to ctx->g */
	movw r26, CTX0
	movw r30, BLOCK0
	ldi r25, 128
.Lubi1024_nextBlock_copy:
	ld r24, Z+
	st X+, r24
	dec r25
	brne .Lubi1024_nextBlock_copy
/* call threefish1024_enc */
	movw r24, CTX0
	movw r22, TFCTX0
	rcall threefish1024_enc
/* xor block into ctx->g */
	movw r26, BLOCK0
	movw r30, CTX0
	ldi r25, 128
.Lubi1024_nextBlock_xor:
	ld r24, X+
	ld r23, Z
	eor r23, r24
	st Z+, r23
	dec r25
	brne .Lubi1024_nextBlock_xor
/* clear 'first' bit in tweak */
	/* CTX0 holds ctx->g = ctx+16, so tweak[15] is the byte right below it;
	 * the pre-decrement load leaves Z at ctx+15 for the store. */
	movw r30, CTX0
	ld r24, -Z
	andi r24, ~0x40
	st Z, r24
	pop_range 2, 7
	stack_free_large2 20*8
	ret

/******************************************************************************/
/*
void ubi1024_lastBlock(ubi1024_ctx_t* ctx, const void* block, uint16_t length_b){
	threefish1024_ctx_t tfctx;
	while(length_b>UBI1024_BLOCKSIZE){
		ubi1024_nextBlock(ctx, block);
		block = (uint8_t*)block + UBI1024_BLOCKSIZE_B;
		length_b -= UBI1024_BLOCKSIZE;
	}
	ctx->tweak[15] |= 0x80;
	((uint64_t*)(ctx->tweak))[0] += (length_b+7)/8;
	if(length_b & 0x07)
		ctx->tweak[14] |= 0x80;
	threefish1024_init(ctx->g, ctx->tweak, &tfctx);
	memset(ctx->g, 0, UBI1024_BLOCKSIZE_B);
	memcpy(ctx->g, block, (length_b+7)/8);
	if(length_b & 0x07)
		ctx->g[(length_b+7)/8-1] |= 0x80>>(length_b&7);
	threefish1024_enc(ctx->g, &tfctx);
	memxor(ctx->g, block, (length_b+7)/8);
	if(length_b & 0x07){
		ctx->g[((length_b+7)/8)-1] ^= 0x80>>(length_b&7);
	}
}  
*/
/*
 * param ctx:     r24:r25
 * param block:   r22:r23
 * param length_b: r20:r21
 */
/*
 * Register aliases for ubi1024_lastBlock. r8..r17 are saved with
 * push_range on entry and restored with pop_range before returning.
 *   MASK_B          0 when length_b is a multiple of 8, otherwise
 *                   0x80 >> (length_b & 7) (the padding bit)
 *   LEN_B           (length_b + 7) / 8, number of bytes in the final block
 *   TFCTX1:TFCTX0   address of the 20*8-byte threefish context on the stack
 *   CTX1:CTX0       ctx; re-pointed to ctx+16 (ctx->g) before threefish1024_init
 *   BLOCK1:BLOCK0   current position in the input data
 *   LENGTH1:LENGTH0 remaining length in bits
 * r1 is used throughout as a zero operand (assumed 0, avr-gcc convention).
 */
MASK_B  =  8 
LEN_B   =  9
TFCTX0  = 10
TFCTX1  = 11
CTX0    = 12
CTX1    = 13
BLOCK0  = 14
BLOCK1  = 15
LENGTH0 = 16
LENGTH1 = 17
.global ubi1024_lastBlock
ubi1024_lastBlock:
/* run nextBlock for preceding blocks*/
	push_range 8, 17
	movw CTX0, r24
	movw BLOCK0, r22
	movw LENGTH0, r20
	/* while the high byte is >= 5 the length is >= 1280 bits, i.e. more
	 * than one full 1024-bit block remains */
1:	cpi LENGTH1, 5
	brlo 2f
	movw r24, CTX0
	movw r22, BLOCK0
	rcall ubi1024_nextBlock
	ldi r25, 128
	add BLOCK0, r25                    /* block += 128 bytes */
	adc BLOCK1, r1
	subi LENGTH1, 4                    /* length_b -= 1024 (4 * 256) */
	rjmp 1b
	/* high byte < 5 here: one more full block only if length_b > 1024,
	 * i.e. high byte == 4 and low byte != 0 */
2:	cpi LENGTH1, 4
	brlo 3f
	tst LENGTH0
	breq 3f                            /* exactly 1024 bits: handled as the last block */
	movw r24, CTX0
	movw r22, BLOCK0
	rcall ubi1024_nextBlock
	ldi r25, 128
	add BLOCK0, r25
	adc BLOCK1, r1
	subi LENGTH1, 4
3:	/* now the real fun */
	/* reserve the threefish context; Z+1 is taken as its address, which
	 * assumes stack_alloc_large leaves Z at the new stack pointer
	 * (macro defined in avr-asm-macros.S, not shown here -- confirm) */
    stack_alloc_large 20*8
	adiw r30, 1
	movw TFCTX0, r30
	/* calculate LEN_B */
	movw r24, LENGTH0
	adiw r24, 7
	lsr r25                            /* three 16-bit right shifts: divide by 8 */
	ror r24
	lsr r25
	ror r24
	lsr r25
	ror r24
	mov LEN_B, r24                     /* length_b <= 1024 here, so the result fits a byte */
	/* add length to tweak */
	movw r30, CTX0
	ld r24, Z
	add r24, LEN_B
	st Z+, r24
	ldi r25, 11                        /* propagate the carry through tweak bytes 1..11 */
1:	ld r24, Z
	adc r24, r1
	st Z+, r24
	dec r25                            /* dec leaves the carry flag for the next adc */
	brne 1b
	/* set 'final' bit*/
	movw r30, CTX0
	ldd r24, Z+15
	ori r24, 0x80
	std Z+15, r24
	/* store in MASK_B if we do bit processing and set 'BitPad' bit*/
	clr MASK_B
	mov r24, LENGTH0
	andi r24, 0x07                     /* r24 = length_b & 7 */
	tst r24
	breq 4f                            /* whole bytes only: MASK_B stays 0 */
	ldd r25, Z+14
	ori r25, 0x80                      /* tweak[14] |= 0x80 */
	std Z+14, r25
	ldi r25, 0x80
	mov MASK_B, r25
1:	lsr MASK_B                         /* MASK_B = 0x80 >> (length_b & 7) */
	dec r24
	brne 1b
4:  /* call threefish1024_init*/
	movw r24, CTX0
	adiw r24, 16                       /* arg0 = ctx->g     */
	movw r22, CTX0                     /* arg1 = ctx->tweak */
	movw CTX0, r24 /* CTX points at ctx->g */
	movw r20, TFCTX0                   /* arg2 = &tfctx     */
	rcall threefish1024_init
	/* copy block to ctx->g */
	movw r26, BLOCK0
	movw r30, CTX0
	mov r24, LEN_B                     /* r24 = bytes to copy        */
	ldi r25, 128
	sub r25, LEN_B                     /* r25 = bytes to zero-fill   */
	tst r24                            /* sets Z for the first pass; later passes use dec's Z */
1:	breq 2f
	ld r22, X+
	st Z+, r22
	dec r24	
	rjmp 1b
	/* with a partial last byte, OR the padding bit into the byte just
	 * written (r22 still holds it) and step Z forward again */
2:	tst MASK_B
	breq 29f
	or r22, MASK_B
	st -Z, r22
	adiw r30, 1
	/* zero the rest of ctx->g; same tst-then-loop flag pattern as above */
29:	tst r25
3:	breq 4f
	st Z+, r1
	dec r25
	rjmp 3b
4: /* call threefish1024_enc */
	movw r24, CTX0
	movw r22, TFCTX0
	rcall threefish1024_enc
   /* xor block into ctx->g */
  	movw r30, CTX0
	movw r26, BLOCK0
	tst LEN_B                          /* LEN_B itself is the counter and is consumed here */
5:	breq 6f
	ld r22, X+
	ld r23, Z
	eor r23, r22
	st Z+, r23
	dec LEN_B
	rjmp 5b		
	/* with a partial last byte, also XOR the padding bit into the last
	 * result byte (r23 still holds it) */
6:	tst MASK_B
	breq 7f
	eor r23, MASK_B
	st -Z, r23
	
7:	stack_free_large2 20*8
	pop_range 8, 17
	ret


